import torch

# Random sample input for the comparison below; it is placed on the GPU when
# CUDA is available and stays on the CPU otherwise, so the script also runs on
# machines without a GPU.
u_values = torch.randn(20).to('cuda' if torch.cuda.is_available() else 'cpu')

# Original c function
def c_old(u_values, M=3, N=3, num_points=100, chunk_size=20):
    """Evaluate a signed grid sum for every entry of a 1-D tensor, in chunks.

    A grid ``t`` of ``num_points`` values is laid out on ``[-M, N]`` and the
    integrand ``b / a`` (with ``b = -t / 2`` and ``a = 1 / 2``) is sampled on
    it. For each entry ``u``:

    * ``u >= 0``: the result is ``dt * sum(integrand[t])`` over grid points
      with ``0 < t <= u``;
    * ``u < 0``: the result is ``-dt * sum(integrand[t])`` over grid points
      with ``u < t <= 0``.

    Entries that are neither ``>= 0`` nor ``< 0`` (NaN) are left at zero.

    Args:
        u_values: 1-D tensor of upper limits (the code indexes and unsqueezes
            along dimension 0/1 only).
        M: The grid starts at ``-M``.
        N: The grid ends at ``N``.
        num_points: Number of grid points.
        chunk_size: Number of entries processed per iteration; bounds the
            size of the ``[chunk, num_points]`` temporaries.

    Returns:
        Tensor with the shape and device of ``u_values``, in the dtype of the
        grid (``torch.linspace``'s default dtype).

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    device = u_values.device
    t_values = torch.linspace(-M, N, num_points, device=device).unsqueeze(0)  # [1, num_points]
    b = -t_values / 2
    a = 1 / 2
    integrand = b / a  # [1, num_points]
    # NOTE(review): linspace spaces the grid by (M + N) / (num_points - 1),
    # while dt divides by num_points -- confirm which step is intended.
    dt = (M + N) / num_points

    # Allocate the output in the grid's dtype so the masked writes below never
    # mix source and destination dtypes.
    result = torch.zeros(u_values.shape, dtype=integrand.dtype, device=device)
    u_column = u_values.unsqueeze(1)  # [num_u_values, 1]

    for start in range(0, u_values.size(0), chunk_size):
        stop = start + chunk_size  # slicing clamps indices past the end
        u_chunk = u_column[start:stop]  # [chunk, 1]
        out_chunk = result[start:stop]  # view: writes land in `result`

        # squeeze(1) keeps the row mask 1-D even when the chunk has one row.
        positive = (u_chunk >= 0).squeeze(1)
        if positive.any():
            in_range = (t_values > 0) & (t_values <= u_chunk)  # [chunk, num_points]
            # Only the total is needed, so sum directly instead of cumsum.
            totals = (integrand * in_range.to(integrand.dtype)).sum(dim=1) * dt
            out_chunk[positive] = totals[positive]

        negative = (u_chunk < 0).squeeze(1)
        if negative.any():
            in_range = (t_values <= 0) & (t_values > u_chunk)  # [chunk, num_points]
            totals = (integrand * in_range.to(integrand.dtype)).sum(dim=1) * dt
            out_chunk[negative] = -totals[negative]

    return result

# Updated c function
def c_new(u_values, M=3, N=3, num_points=100):
    """Evaluate a signed grid sum for every entry of a tensor, fully vectorised.

    A grid ``t`` of ``num_points`` values is laid out on ``[-M, N]`` and the
    integrand ``b / a`` (with ``b = -t / 2`` and ``a = 1 / 2``) is sampled on
    it. For each entry ``u``:

    * ``u >= 0``: the result is ``dt * sum(integrand[t])`` over grid points
      with ``0 < t <= u``;
    * otherwise: the result is ``-dt * sum(integrand[t])`` over grid points
      with ``u < t <= 0``.

    The input is broadcast against the grid through a trailing axis, so any
    shape works (including 0-dim, empty and non-contiguous tensors) without
    reshaping. The whole ``[*u_values.shape, num_points]`` mask is built at
    once, so memory grows with ``u_values.numel() * num_points``.

    Args:
        u_values: Tensor of upper limits, any shape.
        M: The grid starts at ``-M``.
        N: The grid ends at ``N``.
        num_points: Number of grid points.

    Returns:
        Tensor with the shape and device of ``u_values``, in the dtype of the
        grid (``torch.linspace``'s default dtype).
    """
    t_values = torch.linspace(-M, N, num_points, device=u_values.device)  # [num_points]
    b = -t_values / 2
    a = 1 / 2
    integrand = b / a  # [num_points]
    # NOTE(review): linspace spaces the grid by (M + N) / (num_points - 1),
    # while dt divides by num_points -- confirm which step is intended.
    dt = (M + N) / num_points

    # A trailing singleton axis lets every entry broadcast against the grid.
    u_expanded = u_values.unsqueeze(-1)  # [..., 1]

    in_positive = (t_values > 0) & (t_values <= u_expanded)  # [..., num_points]
    in_negative = (t_values <= 0) & (t_values > u_expanded)  # [..., num_points]

    # Only the totals are needed, so sum directly instead of taking the last
    # column of a cumulative sum.
    positive_total = (integrand * in_positive.to(integrand.dtype)).sum(dim=-1) * dt
    negative_total = (integrand * in_negative.to(integrand.dtype)).sum(dim=-1) * dt

    # The negative branch is negated, matching the sign convention above.
    return torch.where(u_values >= 0, positive_total, -negative_total)

# Evaluate the same input with the chunked and the vectorised implementation
result_old, result_new = c_old(u_values), c_new(u_values)

# Compare the two outputs within torch.allclose's default tolerances and report
are_equal = torch.allclose(result_old, result_new)
print(f"Are the results of both functions identical: {are_equal}")

